/*
 * Naive out-of-place transpose of a 2D array of float4 elements.
 *
 * Launched over a 2D range: global id 0 is bounds-checked against `rows`,
 * global id 1 against `cols`. Work-items outside that range do nothing, so
 * the global size may be rounded up past the array dimensions.
 *
 * For each in-range (x, y) the kernel performs
 *     o_data[x * cols + y] = i_data[y * rows + x]
 * i.e. the input is read with a row stride of `rows` and the output is
 * written with a row stride of `cols`. Each float4 is moved as one unit;
 * its four components are not rearranged.
 *
 * i_data  source buffer, read-only; indexed up to rows * cols - 1.
 * o_data  destination buffer; indexed up to rows * cols - 1.
 * rows    extent checked against global id 0; non-positive => no-op.
 * cols    extent checked against global id 1; non-positive => no-op.
 */
__kernel void kernel_naive_transpose(__global const float4 * i_data,
        __global float4 * o_data,
        int rows, int cols)
{
    /*
     * A negative dimension would be converted to a very large unsigned
     * value when compared with the (unsigned) global ids, letting every
     * work-item through the bounds check. Reject such sizes up front.
     */
    if (rows <= 0 || cols <= 0) {
        return;
    }

    /*
     * Do all index arithmetic in size_t, the type get_global_id() returns,
     * so ids are not truncated and the products are not limited to 32 bits
     * on devices with a wider size_t.
     */
    const size_t n_rows = (size_t)rows;
    const size_t n_cols = (size_t)cols;
    const size_t x = get_global_id(0);
    const size_t y = get_global_id(1);

    if (x < n_rows && y < n_cols) {
        o_data[x * n_cols + y] = i_data[y * n_rows + x];
    }
}